#IMPORT (wrangled) data
# Parse the cleaned coded-utterance export once and reuse the parsed tibble,
# instead of reading and parsing the same CSV twice.
df_raw <- read_csv("data/CLEAN_coded_utterances.csv")
# NOTE(review): df_reps was originally read from the very same file as df_raw.
# It is kept as its own name in case later code refers to it -- confirm
# whether a different source file was intended.
df_reps <- df_raw

#WRANGLE into DF of coded utterances 
#NOT unique utterances, 1 obs for each utterance+detail-code
#Builds df_coded from df_raw: raw columns are renamed/converted to factors,
#only the analysis columns are kept, and rows are sorted by data_order.
df_coded <- df_raw %>% 
  #rename and factorize cols
  mutate(
    #UNIQUE IDS
    sid = factor(SID), #unique ID for utterance+detail-code
    pid = factor(PID, levels = c( #explicit level order: happiness-first participants, then space-first
      #HAPPINESS-FIRST    
      "bjs827ee1u", "3r2sh20ei", "4728sjuiz","7ACC0B75","92ghd48xe","iurmer289", "s294hoei",
      #SPACE-FIRST    
      "j2719eertu2","lkin27js09b","li832lin23","7382kwtue","E1D39056","8v892iige")),   
    #create unique ID for utterances
    #rows sharing the same participant and utterance text get the same uid
    uid = factor(as.numeric(factor(paste(pid,factor(Utterance))))), #numeric code of the pid+Utterance combination
    #recode lower case and order based on true task order
    TASK = factor(recode(Condition, "Static"="static", "Interactive"="ixn" )),
    TASK = factor(TASK, levels = c("static", "ixn")), #reorder factor levels: static before ixn
    #rename Notebook as DATASET
    DATASET = factor(recode(Notebook, "Happiness"="happiness", "Space"="space")),
    #create temp dataset order var
    #paste() uses its default " " separator, so the keys look like "ixn _ happiness"
    data_order = factor(paste(TASK,"_",DATASET)), #one level per TASK x DATASET pairing
    #collapse each TASK x DATASET pairing into the dataset-order label it implies
    data_order = recode(data_order, "ixn _ happiness"="space-first",
                                    "ixn _ space"="happiness-first",
                                    "static _ happiness"="happiness-first",
                                    "static _ space"="space-first"),
    utterance = Utterance,
    reps_group = factor(Final_Group),
    reps_all = factor(`All representations`),
    #rename flags
    flag_story = `Dylan Flag Storytelling`,
    flag_correction = `Dylan Flag Correction`,
    flag_simultaneous = `Dylan Flag Simultaneous Characterization`,
    #recode and order TOP LEVEL CODES 
    code_topic = factor(Highlevel),
    code_topic = recode(code_topic, "ANALYSIS PROCESS" = "PROCESS"), #shorten the label
    code_topic = factor(code_topic, levels = c("PROCESS","DATASET","VARIABLE","RELATIONSHIP")), #fixed display order
    code_datatype = factor(`Data Type`),
    code_detail = factor(`Utterance Type`),
    timestamp = adj_timestamp,
    ixn = factor(interaction_used), #was interaction used?
    #explicit participant-number order
    #NOTE(review): presumably parallels the pid level order above -- confirm
    PNUM = factor(PNUM,levels = c("P6", "P9", "P10", "P2", "P4", "P12","P13", 
                                   "P5", "P7", "P8", "P3", "P1","P11")),
    
    ) %>% 
  #keep only the analysis columns, in this order
  dplyr::select(sid,pid,PNUM,uid,TASK,DATASET,timestamp,ixn,code_topic,code_detail,code_datatype,
         flag_story, flag_correction, flag_simultaneous, utterance, reps_group, reps_all, data_order) %>% 
  #sort rows by data_order
  arrange(data_order)

#REPLACE NA in logicals to FALSE
# A missing flag is treated as "flag not set" for all three flag columns.
flag_cols <- c("flag_story", "flag_correction", "flag_simultaneous")
df_coded[flag_cols] <- lapply(
  df_coded[flag_cols],
  function(flag) replace(flag, is.na(flag), FALSE)
)



#CALCULATE RELATIVE TASK TIMES
# For every coded utterance, measure how far into its task it occurred.
# The earliest timestamp within each participant x TASK group is taken as the
# task start; relative_time_s is the offset from that start and relative_time
# is the same offset as a plain double (used as the x-axis of the time plots).
# Intermediate columns that were computed and then discarded by the final
# select (task_end, task_mins, task_second) are no longer created.
df_time <- df_coded %>%
  mutate(time = hms::as_hms(timestamp)) %>%
  group_by(pid, TASK) %>%
  mutate(
    task_start = hms::as_hms(min(time)),        # first utterance of this participant's task
    relative_time_s = timestamp - task_start,   # offset from task start
    relative_time = as.double(relative_time_s)  # same offset, unit-less for plotting
  ) %>%
  ungroup() %>%
  dplyr::select(pid, PNUM, code_topic, code_detail, TASK, DATASET,
                timestamp, task_start, relative_time_s, relative_time)

There are 742 rows in the df_coded dataset, where each row represents an utterance coding (i.e. utterance + detail code). There are 662 unique utterances. The difference indicates utterances that were dual-coded (i.e. two detail-level codes). No more than two codes were applied to a single utterance. For the purposes of analysis, dual-coded utterances will be treated as two utterances, as they have two distinct (but lexically inseparable) units of meaning.

1 DATA PROFILE

# Profile every column of df_coded with summarytools::dfSummary().
# NOTE(review): `method = "render"` is usually passed to print()/view() of the
# summary object rather than to dfSummary() itself -- confirm it has the
# intended effect here.
df_coded%>% summarytools::dfSummary(
             plain.ascii  = FALSE,
             graph.magnif = 0.75,
             style        = "grid",
             tmp.img.dir  = "temp",
             missing.col = FALSE, 
             method = "render"
)

1.0.1 Data Frame Summary

1.0.1.1 df_coded

Dimensions: 742 x 18
Duplicates: 0

No Variable Stats / Values Freqs (% of Valid) Graph Valid Missing
1 sid
[factor]
1. 0
2. 1
3. 2
4. 3
5. 4
6. 5
7. 6
8. 7
9. 8
10. 9
[ 732 others ]
1 ( 0.1%)
1 ( 0.1%)
1 ( 0.1%)
1 ( 0.1%)
1 ( 0.1%)
1 ( 0.1%)
1 ( 0.1%)
1 ( 0.1%)
1 ( 0.1%)
1 ( 0.1%)
732 (98.7%)
742
(100.0%)
0
(0.0%)
2 pid
[factor]
1. bjs827ee1u
2. 3r2sh20ei
3. 4728sjuiz
4. 7ACC0B75
5. 92ghd48xe
6. iurmer289
7. s294hoei
8. j2719eertu2
9. lkin27js09b
10. li832lin23
[ 3 others ]
29 ( 3.9%)
103 (13.9%)
43 ( 5.8%)
28 ( 3.8%)
56 ( 7.5%)
87 (11.7%)
88 (11.9%)
82 (11.1%)
48 ( 6.5%)
51 ( 6.9%)
127 (17.1%)
742
(100.0%)
0
(0.0%)
3 PNUM
[factor]
1. P6
2. P9
3. P10
4. P2
5. P4
6. P12
7. P13
8. P5
9. P7
10. P8
[ 3 others ]
29 ( 3.9%)
103 (13.9%)
43 ( 5.8%)
28 ( 3.8%)
56 ( 7.5%)
87 (11.7%)
88 (11.9%)
82 (11.1%)
48 ( 6.5%)
51 ( 6.9%)
127 (17.1%)
742
(100.0%)
0
(0.0%)
4 uid
[factor]
1. 1
2. 2
3. 3
4. 4
5. 5
6. 6
7. 7
8. 8
9. 9
10. 10
[ 652 others ]
2 ( 0.3%)
1 ( 0.1%)
1 ( 0.1%)
1 ( 0.1%)
1 ( 0.1%)
1 ( 0.1%)
1 ( 0.1%)
2 ( 0.3%)
1 ( 0.1%)
1 ( 0.1%)
730 (98.4%)
742
(100.0%)
0
(0.0%)
5 TASK
[factor]
1. static
2. ixn
403 (54.3%)
339 (45.7%)
742
(100.0%)
0
(0.0%)
6 DATASET
[factor]
1. happiness
2. space
431 (58.1%)
311 (41.9%)
742
(100.0%)
0
(0.0%)
7 timestamp
[hms, difftime]
min : 622
med : 2857
max : 6900
units : secs
622 distinct values 742
(100.0%)
0
(0.0%)
8 ixn
[factor]
1. FALSE
2. TRUE
633 (85.3%)
109 (14.7%)
742
(100.0%)
0
(0.0%)
9 code_topic
[factor]
1. PROCESS
2. DATASET
3. VARIABLE
4. RELATIONSHIP
160 (21.6%)
176 (23.7%)
122 (16.4%)
284 (38.3%)
742
(100.0%)
0
(0.0%)
10 code_detail
[factor]
1. data orientation
2. data provenance
3. data size
4. distribution outlier (var
5. distribution range [min,
6. distribution shape [shape
7. distribution variance (sd
8. missing data
9. outlier (relationship)
10. plan of action
[ 8 others ]
16 ( 2.2%)
11 ( 1.5%)
9 ( 1.2%)
9 ( 1.2%)
33 ( 4.4%)
79 (10.6%)
1 ( 0.1%)
76 (10.2%)
20 ( 2.7%)
52 ( 7.0%)
436 (58.8%)
742
(100.0%)
0
(0.0%)
11 code_datatype
[factor]
1. distribution (continuous
2. distribution (categorical
3. relationship (categorical
4. relationship (categorical
5. relationship (continuous
6. relationship (multivariat
76 (17.8%)
54 (12.7%)
28 ( 6.6%)
55 (12.9%)
146 (34.3%)
67 (15.7%)
426
(57.4%)
316
(42.6%)
12 flag_story
[logical]
1. FALSE
2. TRUE
700 (94.3%)
42 ( 5.7%)
742
(100.0%)
0
(0.0%)
13 flag_correction
[logical]
1. FALSE
2. TRUE
733 (98.8%)
9 ( 1.2%)
742
(100.0%)
0
(0.0%)
14 flag_simultaneous
[logical]
1. FALSE
2. TRUE
682 (91.9%)
60 ( 8.1%)
742
(100.0%)
0
(0.0%)
15 utterance
[character]
1. [Talking about the profil
2. actually, let me see if p
3. Although we have like les
4. And are they within range
5. And confidence in governm
6. And just I want to see ho
7. And so it looks like it s
8. And then if I had more ti
9. Because it does seem like
10. Data frame. Got a bunch o
[ 652 others ]
2 ( 0.3%)
2 ( 0.3%)
2 ( 0.3%)
2 ( 0.3%)
2 ( 0.3%)
2 ( 0.3%)
2 ( 0.3%)
2 ( 0.3%)
2 ( 0.3%)
2 ( 0.3%)
722 (97.3%)
742
(100.0%)
0
(0.0%)
16 reps_group
[factor]
1. barplot
2. columns
3. columns_data_dictionary
4. data_dictionary
5. data_dictionary_dataframe
6. data_dictionary_describe
7. dataframe
8. dataframe_describe
9. dataframe_heatmap
10. dataframe_pairplot
[ 15 others ]
16 ( 2.2%)
4 ( 0.5%)
1 ( 0.1%)
56 ( 7.5%)
1 ( 0.1%)
9 ( 1.2%)
76 (10.2%)
1 ( 0.1%)
1 ( 0.1%)
1 ( 0.1%)
576 (77.6%)
742
(100.0%)
0
(0.0%)
17 reps_all
[factor]
1. affect_corruption_brush_7
2. Age_CryoSleep_scatterplot
3. age_CryoSleep_ShoppingMal
4. Age_RoomService_scatterpl
5. age_roomservice_scatterpl
6. Age_RoomService_scatterpl
7. Age_ShoppingMall_scatterp
8. altair_profile_contVars_j
9. alx_barplot_df_homeplanet
10. alx_barplot_df_homeplanet
[ 245 others ]
4 ( 0.6%)
3 ( 0.4%)
1 ( 0.1%)
1 ( 0.1%)
1 ( 0.1%)
1 ( 0.1%)
1 ( 0.1%)
1 ( 0.1%)
2 ( 0.3%)
1 ( 0.1%)
693 (97.7%)
709
(95.6%)
33
(4.4%)
18 data_order
[factor]
1. space-first
2. happiness-first
308 (41.5%)
434 (58.5%)
742
(100.0%)
0
(0.0%)

ARF has reviewed data profile for missing data and correct factorization.

2 UTTERANCES

Utterances are the lowest-level discrete units of meaning transcribed from the EDA Task transcripts. Utterances are coded at two levels of analysis: (1) topic-code gives a high level topic of the participant’s verbalization, (2) detail-code gives the lower level detail of the subject.

In the following subsections we explore the distribution of number of utterances based on TASK, DATASET, and PARTICIPANT, before describing the distribution of utterances through the timecourse of the TASK.

2.1 [Aggregated] Utterances

FIRST we explore the distribution of utterances by Analysis Task, Dataset, Participant and Time, irrespective of what the utterance was about (topic, detail).

RQ: How much did participants talk aloud during EDA? When did they talk aloud?

Answer: Inspection of frequency tables and visualizations suggests that the most substantial determinant of how many utterances an individual made is individual participant-level differences, rather than structural differences imposed by the TASK or DATASET. This is not altogether unexpected given the fact that across both tasks (static/interactive) and datasets the structure of the experimental task was the same.

2.1.1 by TASK

# Emit a label ahead of the per-TASK frequency table
print("BY TASK")

[1] “BY TASK”

# Frequency table of coded utterances per TASK level
# (cumulative columns, headings and the NA report are switched off)
freq(df_coded$TASK, 
     cumul      = FALSE,
     headings   = FALSE,
     report.nas = FALSE,
     plain.ascii = FALSE) 
  Freq %
static 403 54.31
ixn 339 45.69
Total 742 100.00

2.1.2 by TASK and DATASET

#COUNT BY TASK AND DATASET
# Cross-tabulate coded utterances with TASK in rows and DATASET in columns;
# prop = "t" reports each cell as a share of the grand total.
ctable(x = df_coded$TASK, 
       y = df_coded$DATASET, 
       prop = "t")  

Cross-Tabulation, Total Proportions
TASK * DATASET
Data Frame: df_coded

DATASET happiness space Total
TASK
static 263 (35.4%) 140 (18.9%) 403 ( 54.3%)
ixn 168 (22.6%) 171 (23.0%) 339 ( 45.7%)
Total 431 (58.1%) 311 (41.9%) 742 (100.0%)
#DF SUMMARIZED BY TASK + DATASET
# One row per TASK x DATASET cell; `c` is the number of coded utterances.
# .groups = "drop" returns an ungrouped tibble and avoids the summarise()
# regrouping message.
df_summary <- df_coded %>%
  group_by(TASK, DATASET) %>%
  dplyr::summarise(c = n(), .groups = "drop")

#STACKED BAR BY TASK
# Bars are stacked by DATASET; each segment is labelled with its count.
ggplot(df_summary, aes(x = TASK, y = c, fill = DATASET)) +
  geom_col() +
  geom_text(aes(label = c), size = 3, hjust = 0.5, vjust = 1.5, position = "stack") +
  # scale_fill_brewer(type="qual", palette = 4) +
  labs(
    title = "Utterances by TASK and DATASET",
    subtitle = "More utterances in STATIC; more utterances in HAPPINESS",
    x = "TASK", y = "count"
  ) +
  theme_minimal()

# + theme(legend.position = "blank")

2.1.3 by PARTICIPANT

#COUNT BY PARTICIPANT AND TASK
# Cross-tabulate coded utterances with participant in rows and TASK in columns;
# prop = "r" reports each cell as a share of its row (participant) total.
ctable(x = df_coded$PNUM, 
       y = df_coded$TASK, 
       prop = "r")  

Cross-Tabulation, Row Proportions
PNUM * TASK
Data Frame: df_coded

TASK static ixn Total
PNUM
P6 11 (37.9%) 18 (62.1%) 29 (100.0%)
P9 63 (61.2%) 40 (38.8%) 103 (100.0%)
P10 30 (69.8%) 13 (30.2%) 43 (100.0%)
P2 18 (64.3%) 10 (35.7%) 28 (100.0%)
P4 35 (62.5%) 21 (37.5%) 56 (100.0%)
P12 46 (52.9%) 41 (47.1%) 87 (100.0%)
P13 60 (68.2%) 28 (31.8%) 88 (100.0%)
P5 33 (40.2%) 49 (59.8%) 82 (100.0%)
P7 29 (60.4%) 19 (39.6%) 48 (100.0%)
P8 17 (33.3%) 34 (66.7%) 51 (100.0%)
P3 24 (44.4%) 30 (55.6%) 54 (100.0%)
P1 10 (40.0%) 15 (60.0%) 25 (100.0%)
P11 27 (56.2%) 21 (43.8%) 48 (100.0%)
Total 403 (54.3%) 339 (45.7%) 742 (100.0%)
#UTTERANCES by PARTICPANT facet TASK
# Horizontal count bars, one per participant, filled by DATASET,
# with one facet column per TASK.
utterance_bars <- gf_bar(PNUM ~ ., fill = ~DATASET, data = df_coded)
utterance_bars <- gf_facet_grid(utterance_bars, . ~ TASK)
utterance_bars +
  labs(
    title = "Utterances by Participant, Dataset and Task",
    subtitle = "",
    x = "number of coded utterances",
    y = "participant",
    fill = "DATASET"
  )

2.1.4 through TIME

#DOTPLOT
# One point per coded utterance, placed at its offset from task start,
# one row per participant, one facet row per TASK.
# Facets refer to the TASK column of the plotted data (via vars()) rather than
# to the external vector df_time$TASK, which only lines up while the layer
# data has exactly the same rows in the same order.
# The colour scale and colour label were dropped: no colour aesthetic is mapped.
ggplot(df_time, aes(x = relative_time, y = PNUM)) +
  geom_point(alpha = 0.5, size = 3) +
  facet_grid(rows = vars(TASK)) +
  theme_minimal() +
  labs(
    title = "Participant Utterances over timecourse of Task",
    x = "timecourse of task (seconds)", y = "Participant"
  )

#HISTOGRAMS BY TASK
# Density-scaled histogram (30-unit bins) with a density curve overlaid,
# one facet row per TASK.
# after_stat(density) replaces the deprecated ..density.. notation, and
# legend.position = "none" is the documented way to hide the legend.
ggplot(df_time, aes(x = relative_time)) +
  geom_histogram(aes(y = after_stat(density)), binwidth = 30) +
  geom_density() +
  facet_grid(rows = vars(TASK)) +
  labs(
    title = "Participant Utterances over timecourse of Task",
    x = "timecourse of task (seconds)", y = "frequency of utterances"
  ) +
  theme_minimal() +
  theme(legend.position = "none")
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.

2.2 [TOPIC of] Utterances

NEXT we explore the distribution of utterances coded by high level TOPIC, across Analysis Task, Dataset, Participant and Time.

RQ: What kinds of things did participants talk aloud during EDA? Did they progress through any ‘topical phases’ over the course of the task? Or are topics equally distributed across analysis time?

Answer: Inspection of frequency tables and visualizations suggests that:

  1. Individual differences continue to play an important role

  2. There do not appear to be strong TASK/DATASET effects on topic that are consistent across participants.

  3. PROCESS and RELATIONSHIP topics are more evenly distributed across the timecourse of analysis, while DATASET AND VARIABLE topics are more tightly clustered near the beginning of the analysis. This pattern of distribution is sensical given what we know about EDA, and is consistent with the intuition that patterns of thought during EDA are likely more iterative and situational than we think (or model).

2.2.1 by TASK

#COUNT BY TASK
# Cross-tabulate coded utterances with topic code in rows and TASK in columns;
# prop = "r" reports each cell as a share of its row (topic) total.
ctable(x = df_coded$code_topic, 
       y = df_coded$TASK, 
       prop = "r")  

Cross-Tabulation, Row Proportions
code_topic * TASK
Data Frame: df_coded

TASK static ixn Total
code_topic
PROCESS 92 (57.5%) 68 (42.5%) 160 (100.0%)
DATASET 100 (56.8%) 76 (43.2%) 176 (100.0%)
VARIABLE 77 (63.1%) 45 (36.9%) 122 (100.0%)
RELATIONSHIP 134 (47.2%) 150 (52.8%) 284 (100.0%)
Total 403 (54.3%) 339 (45.7%) 742 (100.0%)
#DF SUMMARIZED BY TOPIC + TASK
# One row per code_topic x TASK cell; `c` is the number of coded utterances.
# .groups = "drop" returns an ungrouped tibble and avoids the summarise()
# regrouping message.
df_summary <- df_coded %>%
  group_by(code_topic, TASK) %>%
  dplyr::summarise(c = n(), .groups = "drop")

#STACKED BAR BY TASK
# Bars are stacked by topic (level order reversed for the fill);
# each segment is labelled with its count.
ggplot(df_summary, aes(x = TASK, y = c, fill = fct_rev(code_topic))) +
  geom_col() +
  geom_text(aes(label = c), size = 3, hjust = 0.5, vjust = 1.5, position = "stack") +
  scale_fill_brewer(type = "qual", palette = 3) +
  labs(
    title = "TOPICS by TASK",
    subtitle = "",
    x = "TASK", y = "count", fill = "TOPIC"
  ) +
  theme_minimal()

# + theme(legend.position = "blank")

2.2.2 by TASK and DATASET

#DF SUMMARIZED BY TOPIC + TASK + DATASET
# One row per code_topic x TASK x DATASET cell; `c` is the number of coded
# utterances. .groups = "drop" returns an ungrouped tibble.
df_summary <- df_coded %>%
  group_by(code_topic, TASK, DATASET) %>%
  dplyr::summarise(c = n(), .groups = "drop")

#STACKED BAR BY TASK FACET DATASET
# Facets refer to the DATASET column of the plotted data (via vars()) rather
# than to the external vector df_summary$DATASET, so the plot stays correct
# even if df_summary is later re-created with a different grouping.
ggplot(df_summary, aes(x = TASK, y = c, fill = fct_rev(code_topic))) +
  facet_wrap(vars(DATASET)) +
  geom_col() +
  geom_text(aes(label = c), size = 3, hjust = 0.5, vjust = 1.5, position = "stack") +
  scale_fill_brewer(type = "qual", palette = 3) +
  labs(
    title = "TOPICS by TASK and DATASET",
    subtitle = "",
    x = "TASK", y = "count", fill = "TOPIC"
  ) +
  theme_minimal()

# + theme(legend.position = "blank")

2.2.3 by PARTICIPANT

#COUNT BY PARTICIPANT 
# Cross-tabulate coded utterances with participant in rows and topic code in
# columns; prop = "r" reports each cell as a share of its row total.
ctable(x = df_coded$PNUM, 
       y = df_coded$code_topic, 
       prop = "r")  

Cross-Tabulation, Row Proportions
PNUM * code_topic
Data Frame: df_coded

code_topic PROCESS DATASET VARIABLE RELATIONSHIP Total
PNUM
P6 6 (20.7%) 5 (17.2%) 5 (17.2%) 13 (44.8%) 29 (100.0%)
P9 19 (18.4%) 36 (35.0%) 23 (22.3%) 25 (24.3%) 103 (100.0%)
P10 8 (18.6%) 11 (25.6%) 8 (18.6%) 16 (37.2%) 43 (100.0%)
P2 3 (10.7%) 8 (28.6%) 16 (57.1%) 1 ( 3.6%) 28 (100.0%)
P4 10 (17.9%) 6 (10.7%) 11 (19.6%) 29 (51.8%) 56 (100.0%)
P12 21 (24.1%) 28 (32.2%) 17 (19.5%) 21 (24.1%) 87 (100.0%)
P13 41 (46.6%) 7 ( 8.0%) 8 ( 9.1%) 32 (36.4%) 88 (100.0%)
P5 6 ( 7.3%) 18 (22.0%) 18 (22.0%) 40 (48.8%) 82 (100.0%)
P7 14 (29.2%) 14 (29.2%) 7 (14.6%) 13 (27.1%) 48 (100.0%)
P8 10 (19.6%) 7 (13.7%) 3 ( 5.9%) 31 (60.8%) 51 (100.0%)
P3 7 (13.0%) 6 (11.1%) 2 ( 3.7%) 39 (72.2%) 54 (100.0%)
P1 8 (32.0%) 4 (16.0%) 0 ( 0.0%) 13 (52.0%) 25 (100.0%)
P11 7 (14.6%) 26 (54.2%) 4 ( 8.3%) 11 (22.9%) 48 (100.0%)
Total 160 (21.6%) 176 (23.7%) 122 (16.4%) 284 (38.3%) 742 (100.0%)
#TOPICS by PARTICPANT facet TASK
# Horizontal count bars, one per participant, filled by topic,
# with one facet column per TASK.
# The title now names what is plotted: the fill is TOPIC, and DATASET is not
# shown in this figure.
p <- gf_bar(PNUM ~ ., fill = ~ fct_rev(code_topic), data = df_coded) %>%
  gf_facet_grid(. ~ TASK) +
  scale_fill_brewer(type = "qual", palette = 3) +
  labs(
    title = "Topics by Participant and Task",
    subtitle = "",
    x = "number of coded utterances",
    y = "participant",
    fill = "TOPIC"
  ) +
  theme_minimal()
p

# Arguments are spelled out: the original `ggsave(p, file = ...)` only worked
# because `file` partially matched `filename`.
ggsave(filename = "figures/topics_by_count.png", plot = p)



# #TOPICS by PARTICPANT facet TASK
# gf_bar( PNUM ~., fill = ~ fct_rev(code_topic), data = df_coded) %>% 
#   gf_facet_grid(DATASET~TASK) + 
#   scale_fill_brewer(type="qual", palette = 3) +
#   labs(
#     title = "Utterances by Participant, Dataset and Task",
#     subtitle = "",
#     x = "number of coded utterances",
#     y = "participant",
#     fill = "DATASET"
#   )

2.2.4 by TIME

#HISTOGRAMS BY TASK
# Density-scaled histogram (30-unit bins) with a density curve overlaid,
# in a grid of topic (rows) x TASK (columns).
# Facets refer to columns of the plotted data rather than to df_time$...
# vectors; after_stat(density) replaces the deprecated ..density..;
# legend.position = "none" is the documented way to hide the legend.
ggplot(df_time, aes(x = relative_time)) +
  geom_histogram(
    aes(
      y = after_stat(density),
      fill = fct_rev(code_topic),
      color = fct_rev(code_topic)
    ),
    binwidth = 30
  ) +
  geom_density() +
  facet_grid(code_topic ~ TASK) +
  scale_fill_brewer(type = "qual", palette = 3) +
  scale_color_brewer(type = "qual", palette = 3) +
  labs(
    title = "Topic of Utterance over timecourse of Task",
    x = "timecourse of task (seconds)", y = "frequency of utterances",
    fill = "Topic"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

#DOTPLOT (participants on y, facet by TASK)
# The y axis shows PNUM, so it is labelled "Participant" (it was "Task").
p <- ggplot(df_time, aes(x = relative_time, y = PNUM, color = fct_rev(code_topic))) +
  geom_point(alpha = 0.5, size = 3) +
  facet_grid(rows = vars(TASK)) +
  scale_color_brewer(type = "qual", palette = 3) +
  theme_minimal() +
  labs(
    title = "Topic of Utterances over timecourse of Task",
    x = "timecourse of task (seconds)", y = "Participant",
    color = "Topic"
  )
p

#DOTPLOT (tasks on y, facet by participant)
# The y axis shows TASK, so it is labelled "Task" (it was "Participant").
ggplot(df_time, aes(x = relative_time, y = fct_rev(TASK), color = fct_rev(code_topic))) +
  geom_point(alpha = 0.5, size = 3) +
  facet_grid(rows = vars(PNUM)) +
  # facet_grid(TASK ~ DATASET) +
  scale_color_brewer(type = "qual", palette = 3) +
  theme_minimal() +
  labs(
    title = "Topic of Utterances over timecourse of Task",
    x = "timecourse of task (seconds)", y = "Task",
    color = "Topic"
  )

# Saves `p`, i.e. the participant-by-TASK dotplot above, not the last plot drawn.
# NOTE(review): confirm that this is the figure intended for export.
ggsave(filename = "figures/topics_in_time.png", plot = p)
## Saving 7 x 5 in image

2.3 [DETAIL of] Utterances

NEXT we explore the distribution of specific detail utterances across Analysis Task, Dataset, Participant and Time.

RQ: What specific things did participants talk aloud during EDA? Are there any details folks only mention during static(v)interactive, or nominal(v)numeric tasks? Any substantial changes in proportion by TASK or DATASET?

Answer:

2.3.1 DETAIL—PROCESS

#PREP DATA FRAMES
# Subset to PROCESS-topic codings, once from df_coded (for counts) and once
# from df_time (for the timecourse plots).
df_process <- df_coded %>%
  filter(code_topic == "PROCESS") %>%
  dplyr::select(pid, PNUM, TASK, DATASET, code_detail)

df_time_process <- df_time %>%
  filter(code_topic == "PROCESS") %>%
  dplyr::select(pid, PNUM, TASK, DATASET, relative_time, code_detail)

# Counts of each detail code per TASK and per DATASET.
# .groups = "drop" returns ungrouped tibbles and avoids the summarise()
# regrouping message.
df_summary_task <- df_process %>%
  group_by(code_detail, TASK) %>%
  dplyr::summarise(c = n(), .groups = "drop")

df_summary_dataset <- df_process %>%
  group_by(code_detail, DATASET) %>%
  dplyr::summarise(c = n(), .groups = "drop")


#DETAILS BY TASK
# Stacked counts of PROCESS detail codes per TASK, segments labelled with counts.
ggplot(df_summary_task, aes(x = TASK, y = c, fill = code_detail)) +
  geom_col() +
  geom_text(aes(label = c), size = 3, hjust = 0.5, vjust = 1.5, position = "stack") +
  scale_fill_brewer(type = "seq", palette = "PuRd") +
  labs(
    title = "PROCESS Utterances by TASK",
    subtitle = "",
    caption = "weak to moderate difference in PROCESS utterances by TASK, \n but these do not seem substantial when broken into the two categories",
    x = "TASK", y = "count"
  ) +
  theme_minimal()

#DETAILS BY DATASET
# Stacked counts of PROCESS detail codes per DATASET.
ggplot(df_summary_dataset, aes(x = DATASET, y = c, fill = code_detail)) +
  geom_col() +
  geom_text(aes(label = c), size = 3, hjust = 0.5, vjust = 1.5, position = "stack") +
  scale_fill_brewer(type = "seq", palette = "PuRd") +
  labs(
    title = "PROCESS Utterances by DATASET",
    subtitle = "",
    caption = "much more substantial differences in PROCESS utterances by DATASET, \n consistent with intution Ps had more to say about the numeric (vs) nominal outcome variable",
    x = "DATASET", y = "count"
  ) +
  theme_minimal()

#DETAILS DOTPLOT
# Facets refer to the TASK column of the plotted data (via vars()) instead of
# the external vector df_time_process$TASK.
ggplot(df_time_process, aes(x = relative_time, y = PNUM, color = fct_rev(code_detail))) +
  geom_point(alpha = 0.5, size = 3) +
  facet_grid(rows = vars(TASK)) +
  scale_color_brewer(type = "seq", palette = "PuRd") +
  theme_minimal() +
  labs(
    title = "PROCESS Utterances by timecourse of Task",
    caption = "appear randomly distributed through time \n expected and reasonable given PROCESS utterances are meta-level",
    x = "timecourse of task (seconds)", y = "Participant",
    color = "Topic"
  )

#DETAIL HISTOGRAMS BY TASK
# Grid of TASK (rows) x detail code (columns); legend hidden with the
# documented value "none".
ggplot(df_time_process, aes(x = relative_time, fill = fct_rev(code_detail))) +
  geom_histogram(binwidth = 30) +
  facet_grid(TASK ~ code_detail) +
  scale_fill_brewer(type = "seq", palette = "PuRd") +
  labs(
    title = "PROCESS Utterances by timecourse of Task",
    x = "timecourse of task (seconds)", y = "frequency of utterances"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

#PROCESSES by PARTICPANT facet TASK
p <- gf_bar(PNUM ~ ., fill = ~ fct_rev(code_detail), data = df_process) %>%
  gf_facet_grid(. ~ TASK) +
  scale_fill_brewer(type = "seq", palette = "PuRd") +
  labs(
    title = "PROCESS Utterances by Participant and Task",
    subtitle = "",
    caption = "TODO explore P13 representation comments",
    x = "number of coded utterances",
    y = "participant",
    fill = "TOPIC"
  ) +
  theme_minimal()
p

# Arguments spelled out; `file =` previously relied on partial matching of `filename`.
ggsave(filename = "figures/process_counts.png", plot = p)

2.3.2 DETAIL—DATASET

#PREP DATA FRAMES
# Subset to DATASET-topic codings, once from df_coded (for counts) and once
# from df_time (for the timecourse plots).
df_dataset <- df_coded %>%
  filter(code_topic == "DATASET") %>%
  dplyr::select(pid, PNUM, TASK, DATASET, code_detail)

df_time_dataset <- df_time %>%
  filter(code_topic == "DATASET") %>%
  dplyr::select(pid, PNUM, TASK, DATASET, relative_time, code_detail)

# Counts of each detail code per TASK and per DATASET.
# .groups = "drop" returns ungrouped tibbles and avoids the summarise()
# regrouping message.
df_summary_task <- df_dataset %>%
  group_by(code_detail, TASK) %>%
  dplyr::summarise(c = n(), .groups = "drop")

df_summary_dataset <- df_dataset %>%
  group_by(code_detail, DATASET) %>%
  dplyr::summarise(c = n(), .groups = "drop")


#DETAILS BY TASK
# Stacked counts of DATASET detail codes per TASK, segments labelled with counts.
ggplot(df_summary_task, aes(x = TASK, y = c, fill = code_detail)) +
  geom_col() +
  geom_text(aes(label = c), size = 3, hjust = 0.5, vjust = 1.5, position = "stack") +
  scale_fill_brewer(type = "seq", palette = 4) +
  labs(
    title = "DATASET Utterances by TASK",
    subtitle = "",
    caption = "notable decrease MISSING DATA utterances in IXN \n unsure what might explain this, explore at individual level",
    x = "TASK", y = "count"
  ) +
  theme_minimal()

#DETAILS BY DATASET
# Stacked counts of DATASET detail codes per DATASET.
ggplot(df_summary_dataset, aes(x = DATASET, y = c, fill = code_detail)) +
  geom_col() +
  geom_text(aes(label = c), size = 3, hjust = 0.5, vjust = 1.5, position = "stack") +
  scale_fill_brewer(type = "seq", palette = 4) +
  labs(
    title = "DATASET Utterances by DATASET",
    subtitle = "",
    caption = "minor differences by DATASET  reasonable given DATASET \n  represenations are typically tabluar (data dictionary, describe, head/tail)",
    x = "DATASET", y = "count"
  ) +
  theme_minimal()

#DETAILS DOTPLOT
# Facets refer to the TASK column of the plotted data (via vars()) instead of
# the external vector df_time_dataset$TASK.
ggplot(df_time_dataset, aes(x = relative_time, y = PNUM, color = fct_rev(code_detail))) +
  geom_point(alpha = 0.5, size = 3) +
  facet_grid(rows = vars(TASK)) +
  scale_color_brewer(type = "seq", palette = 4) +
  theme_minimal() +
  labs(
    title = "DATASET Utterances by timecourse of Task",
    x = "timecourse of task (seconds)", y = "Participant",
    caption = "notable sparsity in center of timecourse, reasonable as EDA normative behavior \n is to consider dataframe shape and missing data at the start of an analysis",
    color = "Topic"
  )

#DETAIL HISTOGRAMS BY TASK
# Grid of TASK (rows) x detail code (columns); legend hidden with the
# documented value "none".
ggplot(df_time_dataset, aes(x = relative_time, fill = fct_rev(code_detail))) +
  geom_histogram(binwidth = 30) +
  facet_grid(TASK ~ code_detail) +
  scale_fill_brewer(type = "seq", palette = 4) +
  labs(
    title = "DATASET Utterances by timecourse of Task",
    x = "timecourse of task (seconds)", y = "frequency of utterances",
    caption = "sensical that the most uniformly distributed detail code is missing data \n as this can be discovered via graphing"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

#DATASET UTTERANCES by PARTICPANT facet TASK
p <- gf_bar(PNUM ~ ., fill = ~ fct_rev(code_detail), data = df_dataset) %>%
  gf_facet_grid(. ~ TASK) +
  scale_fill_brewer(type = "seq", palette = 4) +
  labs(
    title = "DATASET Utterances by Participant and Task",
    subtitle = "",
    x = "number of coded utterances",
    y = "participant",
    fill = "TOPIC",
    caption = "P11, P12 contribute to MISSING DATA \n P9 contributes largely to variable metadata \n INVESTIGATE FURTHER"
  ) +
  theme_minimal()
p

# Arguments spelled out; `file =` previously relied on partial matching of `filename`.
ggsave(filename = "figures/dataset_counts.png", plot = p)

2.3.3 DETAIL—VARIABLE

#PREP DATA FRAMES
# Subset to VARIABLE-topic codings, once from df_coded (for counts) and once
# from df_time (for the timecourse plots).
df_variable <- df_coded %>%
  filter(code_topic == "VARIABLE") %>%
  dplyr::select(pid, PNUM, TASK, DATASET, code_detail)

df_time_variable <- df_time %>%
  filter(code_topic == "VARIABLE") %>%
  dplyr::select(pid, PNUM, TASK, DATASET, relative_time, code_detail)

# Counts of each detail code per TASK and per DATASET.
# .groups = "drop" returns ungrouped tibbles and avoids the summarise()
# regrouping message.
df_summary_task <- df_variable %>%
  group_by(code_detail, TASK) %>%
  dplyr::summarise(c = n(), .groups = "drop")

df_summary_dataset <- df_variable %>%
  group_by(code_detail, DATASET) %>%
  dplyr::summarise(c = n(), .groups = "drop")


#DETAILS BY TASK
# Stacked counts of VARIABLE detail codes per TASK, segments labelled with counts.
ggplot(df_summary_task, aes(x = TASK, y = c, fill = code_detail)) +
  geom_col() +
  geom_text(aes(label = c), size = 3, hjust = 0.5, vjust = 1.5, position = "stack") +
  scale_fill_brewer(type = "seq", palette = 5) +
  labs(
    title = "VARIABLE Utterances by TASK",
    subtitle = "",
    caption = "notably more RANGE obs in STATIC than INTERACTIVE \n TODO consider collapsing distribution-variance ",
    x = "TASK", y = "count"
  ) +
  theme_minimal()

#DETAILS BY DATASET
# Stacked counts of VARIABLE detail codes per DATASET.
ggplot(df_summary_dataset, aes(x = DATASET, y = c, fill = code_detail)) +
  geom_col() +
  geom_text(aes(label = c), size = 3, hjust = 0.5, vjust = 1.5, position = "stack") +
  scale_fill_brewer(type = "seq", palette = 5) +
  labs(
    title = "VARIABLE Utterances by DATASET",
    caption = "Notably more SHAPE in SPACE than HAPPINESS \n notably fewer RANGE in SPACE than HAPPINESS \n TODO are shape and range normalized across variable types? ",
    subtitle = "",
    x = "DATASET", y = "count"
  ) +
  theme_minimal()

#DETAILS DOTPLOT
# Facets refer to the TASK column of the plotted data (via vars()) instead of
# the external vector df_time_variable$TASK.
ggplot(df_time_variable, aes(x = relative_time, y = PNUM, color = fct_rev(code_detail))) +
  geom_point(alpha = 0.5, size = 3) +
  facet_grid(rows = vars(TASK)) +
  scale_color_brewer(type = "seq", palette = 5) +
  theme_minimal() +
  labs(
    title = "VARIABLE Utterances by timecourse of Task",
    caption = "IXN appears more BIMODAL than STATIC where the distribution is more uniform",
    x = "timecourse of task (seconds)", y = "Participant",
    color = "Topic"
  )

#DETAIL HISTOGRAMS BY TASK
# Grid of TASK (rows) x detail code (columns); legend hidden with the
# documented value "none".
ggplot(df_time_variable, aes(x = relative_time, fill = fct_rev(code_detail))) +
  geom_histogram(binwidth = 30) +
  facet_grid(TASK ~ code_detail) +
  scale_fill_brewer(type = "seq", palette = 5) +
  labs(
    title = "VARIABLE Utterances by timecourse of Task",
    x = "timecourse of task (seconds)", y = "frequency of utterances"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

#VARIABLE UTTERANCES by PARTICPANT facet TASK
p <- gf_bar(PNUM ~ ., fill = ~ fct_rev(code_detail), data = df_variable) %>%
  gf_facet_grid(. ~ TASK) +
  scale_fill_brewer(type = "seq", palette = 5) +
  labs(
    title = "VARIABLE Utterances by Participant and Task",
    subtitle = "",
    x = "number of coded utterances",
    y = "participant",
    fill = "TOPIC",
    caption = "substantial individual differences, most everyone made some comments \n at some point about distribution shape, but \n discussion of outliers, variance and range was more idiosyncratic \n TODO INVESTIGATE P4 STATIC RANGE very high"
  ) +
  theme_minimal()
p

# Arguments spelled out; `file =` previously relied on partial matching of `filename`.
ggsave(filename = "figures/variable_counts.png", plot = p)

2.3.4 DETAIL—RELATIONSHIP

#PREP DATA FRAMES
# Subset to RELATIONSHIP-topic codings, once from df_coded (for counts) and
# once from df_time (for the timecourse plots).
df_relationship <- df_coded %>%
  filter(code_topic == "RELATIONSHIP") %>%
  dplyr::select(pid, PNUM, TASK, DATASET, code_detail)

df_time_relationship <- df_time %>%
  filter(code_topic == "RELATIONSHIP") %>%
  dplyr::select(pid, PNUM, TASK, DATASET, relative_time, code_detail)

# Counts of each detail code per TASK and per DATASET.
# .groups = "drop" returns ungrouped tibbles and avoids the summarise()
# regrouping message.
df_summary_task <- df_relationship %>%
  group_by(code_detail, TASK) %>%
  dplyr::summarise(c = n(), .groups = "drop")

df_summary_relationship <- df_relationship %>%
  group_by(code_detail, DATASET) %>%
  dplyr::summarise(c = n(), .groups = "drop")


#DETAILS BY TASK
ggplot(df_summary_task, aes(x = TASK, y=c, fill= code_detail)) + 
  geom_col() + 
  geom_text(aes(label=c), size = 3, hjust = 0.5, vjust = 1.5, position = "stack") + 
  scale_fill_brewer(type="seq", palette = 3) +
  labs( title = "RELATIONSHIP Utterances by TASK",
        subtitle = "",
        caption = "TODO think about proportion of existence and strength/direction",
        x= "TASK", y = "count") + theme_minimal() 

#DETAILS BY DATASET
ggplot(df_summary_relationship, aes(x = DATASET, y=c, fill= code_detail)) + 
  geom_col() + 
  geom_text(aes(label=c), size = 3, hjust = 0.5, vjust = 1.5, position = "stack") + 
  scale_fill_brewer(type="seq", palette = 3) +
  labs( title = "RELATIONSHIP Utterances by DATASET",
        subtitle = "",
        caption = "substantial differences by DATASET, \n consistent with pattern of results with VARIABLE utterances \n perhaps due to sparsity of knowledge of analysis of nominal X nominal relationships, and tool coverage",
        x= "DATASET", y = "count") + theme_minimal() 

gf_bar( ~code_detail, fill = ~code_detail, data = df_relationship) %>% 
  gf_facet_grid(TASK ~ DATASET) + labs(
    caption = "TODO CONSIDER THIS",
  )

#DETAILS DOTPLOT
ggplot(df_time_relationship, aes(x=relative_time, y = PNUM, color=fct_rev(code_detail))) + 
  geom_point(alpha=0.5, size=3) +
  facet_grid(df_time_relationship$TASK) +
  scale_color_brewer(type="seq", palette = 3) +
  theme_minimal() + labs(
    title = "RELATIONSHIP Utterances by timecourse of Task",
    caption = "uniformly distributed, as expected \n may see variance if dimension of HYPOTHESIS vs OBSERVATION was coded",
    x= "timecourse of task (seconds)", y = "Participant",
    color = "Topic"
  ) 

#DETAIL HISTOGRAMS BY TASK
ggplot(df_time_relationship, aes(x = relative_time, fill = fct_rev(code_detail))) + 
  geom_histogram(binwidth = 30) + 
  facet_grid(df_time_relationship$TASK ~ df_time_relationship$code_detail ) +
  scale_fill_brewer(type="seq", palette = 3) +
  theme_minimal() + labs(
    title = "RELATIONSHIP Utterances by timecourse of Task",
    x= "timecourse of task (seconds)", y = "frequency of utterances"
  ) + theme_minimal() + theme(legend.position = "blank")

#RELATIONSHIP UTTERANCES by PARTICIPANT facet TASK
# Horizontal bars: one bar per participant (PNUM), stacked by detail code,
# one facet column per TASK. The outer parentheses print the plot as well as
# assigning it to `p` for saving below.
(p <- gf_bar(PNUM ~ ., fill = ~ fct_rev(code_detail), data = df_relationship) %>%
  gf_facet_grid(. ~ TASK) +
  scale_fill_brewer(type = "seq", palette = 3) +
  labs(
    title = "RELATIONSHIP Utterances by Participant and Task",
    subtitle = "",
    x = "number of coded utterances",
    y = "participant",
    fill = "TOPIC"
  ) +
  theme_minimal()
)

# Name both arguments explicitly: the previous call relied on `file` partially
# matching `filename` and on `p` then falling into the `plot` slot.
ggsave(filename = "figures/relationship_counts.png", plot = p)

3 REPRESENTATIONS

3.1 Representation Groups

How many representations were created?

# Section label for the reps_group x TASK cross-tabulation printed next.
print("BY TASK")

[1] “BY TASK”

# Cross-tabulate representation group (rows) against TASK (columns).
# prop = "t" reports every cell as a share of the grand total.
ctable(
  x = df_coded$reps_group,
  y = df_coded$TASK,
  prop = "t",
  plain.ascii = FALSE
)

3.1.1 Cross-Tabulation, Total Proportions

3.1.1.1 reps_group * TASK

Data Frame: df_coded

TASK static ixn Total
reps_group
barplot 8 ( 1.1%) 8 ( 1.1%) 16 ( 2.2%)
columns 4 ( 0.5%) 0 ( 0.0%) 4 ( 0.5%)
columns_data_dictionary 1 ( 0.1%) 0 ( 0.0%) 1 ( 0.1%)
data_dictionary 27 ( 3.6%) 29 ( 3.9%) 56 ( 7.5%)
data_dictionary_dataframe 1 ( 0.1%) 0 ( 0.0%) 1 ( 0.1%)
data_dictionary_describe 9 ( 1.2%) 0 ( 0.0%) 9 ( 1.2%)
dataframe 50 ( 6.7%) 26 ( 3.5%) 76 ( 10.2%)
dataframe_describe 1 ( 0.1%) 0 ( 0.0%) 1 ( 0.1%)
dataframe_heatmap 0 ( 0.0%) 1 ( 0.1%) 1 ( 0.1%)
dataframe_pairplot 1 ( 0.1%) 0 ( 0.0%) 1 ( 0.1%)
dataframe_profile 1 ( 0.1%) 2 ( 0.3%) 3 ( 0.4%)
describe 11 ( 1.5%) 12 ( 1.6%) 23 ( 3.1%)
describe_profile 1 ( 0.1%) 0 ( 0.0%) 1 ( 0.1%)
double-profiler 16 ( 2.2%) 7 ( 0.9%) 23 ( 3.1%)
heatmap 12 ( 1.6%) 7 ( 0.9%) 19 ( 2.6%)
hist 6 ( 0.8%) 0 ( 0.0%) 6 ( 0.8%)
info 9 ( 1.2%) 4 ( 0.5%) 13 ( 1.8%)
lineplot 36 ( 4.9%) 0 ( 0.0%) 36 ( 4.9%)
Multi-view Chart 8 ( 1.1%) 51 ( 6.9%) 59 ( 8.0%)
none 24 ( 3.2%) 19 ( 2.6%) 43 ( 5.8%)
pairplot 38 ( 5.1%) 12 ( 1.6%) 50 ( 6.7%)
profile 47 ( 6.3%) 59 ( 8.0%) 106 ( 14.3%)
python 37 ( 5.0%) 23 ( 3.1%) 60 ( 8.1%)
scatterplot 49 ( 6.6%) 79 (10.6%) 128 ( 17.3%)
stripplot 6 ( 0.8%) 0 ( 0.0%) 6 ( 0.8%)
Total 403 (54.3%) 339 (45.7%) 742 (100.0%)
# Section label for the reps_group x DATASET cross-tabulation printed next.
print("BY DATASET")

[1] “BY DATASET”

# Cross-tabulate representation group (rows) against DATASET (columns).
# prop = "t" reports every cell as a share of the grand total.
ctable(
  x = df_coded$reps_group,
  y = df_coded$DATASET,
  prop = "t",
  plain.ascii = FALSE
)

3.1.2 Cross-Tabulation, Total Proportions

3.1.2.1 reps_group * DATASET

Data Frame: df_coded

DATASET happiness space Total
reps_group
barplot 3 ( 0.4%) 13 ( 1.8%) 16 ( 2.2%)
columns 1 ( 0.1%) 3 ( 0.4%) 4 ( 0.5%)
columns_data_dictionary 0 ( 0.0%) 1 ( 0.1%) 1 ( 0.1%)
data_dictionary 22 ( 3.0%) 34 ( 4.6%) 56 ( 7.5%)
data_dictionary_dataframe 1 ( 0.1%) 0 ( 0.0%) 1 ( 0.1%)
data_dictionary_describe 7 ( 0.9%) 2 ( 0.3%) 9 ( 1.2%)
dataframe 34 ( 4.6%) 42 ( 5.7%) 76 ( 10.2%)
dataframe_describe 0 ( 0.0%) 1 ( 0.1%) 1 ( 0.1%)
dataframe_heatmap 1 ( 0.1%) 0 ( 0.0%) 1 ( 0.1%)
dataframe_pairplot 1 ( 0.1%) 0 ( 0.0%) 1 ( 0.1%)
dataframe_profile 0 ( 0.0%) 3 ( 0.4%) 3 ( 0.4%)
describe 15 ( 2.0%) 8 ( 1.1%) 23 ( 3.1%)
describe_profile 0 ( 0.0%) 1 ( 0.1%) 1 ( 0.1%)
double-profiler 5 ( 0.7%) 18 ( 2.4%) 23 ( 3.1%)
heatmap 14 ( 1.9%) 5 ( 0.7%) 19 ( 2.6%)
hist 0 ( 0.0%) 6 ( 0.8%) 6 ( 0.8%)
info 9 ( 1.2%) 4 ( 0.5%) 13 ( 1.8%)
lineplot 36 ( 4.9%) 0 ( 0.0%) 36 ( 4.9%)
Multi-view Chart 37 ( 5.0%) 22 ( 3.0%) 59 ( 8.0%)
none 27 ( 3.6%) 16 ( 2.2%) 43 ( 5.8%)
pairplot 44 ( 5.9%) 6 ( 0.8%) 50 ( 6.7%)
profile 55 ( 7.4%) 51 ( 6.9%) 106 ( 14.3%)
python 16 ( 2.2%) 44 ( 5.9%) 60 ( 8.1%)
scatterplot 103 (13.9%) 25 ( 3.4%) 128 ( 17.3%)
stripplot 0 ( 0.0%) 6 ( 0.8%) 6 ( 0.8%)
Total 431 (58.1%) 311 (41.9%) 742 (100.0%)

3.2 TOPICS + REPRESENTATIONS

# Work on an alias of df_coded; the table header is labelled with this name
# ("Data Frame: df").
df <- df_coded

# Representation group (rows) x top-level topic code (columns), each cell
# reported as a share of the grand total (prop = "t").
ctable(
  x = df$reps_group,
  y = df$code_topic,
  prop = "t",
  plain.ascii = FALSE
)

3.2.1 Cross-Tabulation, Total Proportions

3.2.1.1 reps_group * code_topic

Data Frame: df

code_topic PROCESS DATASET VARIABLE RELATIONSHIP Total
reps_group
barplot 5 ( 0.7%) 1 ( 0.1%) 2 ( 0.3%) 8 ( 1.1%) 16 ( 2.2%)
columns 0 ( 0.0%) 2 ( 0.3%) 1 ( 0.1%) 1 ( 0.1%) 4 ( 0.5%)
columns_data_dictionary 0 ( 0.0%) 1 ( 0.1%) 0 ( 0.0%) 0 ( 0.0%) 1 ( 0.1%)
data_dictionary 5 ( 0.7%) 45 ( 6.1%) 1 ( 0.1%) 5 ( 0.7%) 56 ( 7.5%)
data_dictionary_dataframe 0 ( 0.0%) 1 ( 0.1%) 0 ( 0.0%) 0 ( 0.0%) 1 ( 0.1%)
data_dictionary_describe 0 ( 0.0%) 1 ( 0.1%) 7 ( 0.9%) 1 ( 0.1%) 9 ( 1.2%)
dataframe 18 ( 2.4%) 37 ( 5.0%) 2 ( 0.3%) 19 ( 2.6%) 76 ( 10.2%)
dataframe_describe 0 ( 0.0%) 1 ( 0.1%) 0 ( 0.0%) 0 ( 0.0%) 1 ( 0.1%)
dataframe_heatmap 0 ( 0.0%) 0 ( 0.0%) 0 ( 0.0%) 1 ( 0.1%) 1 ( 0.1%)
dataframe_pairplot 0 ( 0.0%) 0 ( 0.0%) 1 ( 0.1%) 0 ( 0.0%) 1 ( 0.1%)
dataframe_profile 1 ( 0.1%) 2 ( 0.3%) 0 ( 0.0%) 0 ( 0.0%) 3 ( 0.4%)
describe 2 ( 0.3%) 13 ( 1.8%) 5 ( 0.7%) 3 ( 0.4%) 23 ( 3.1%)
describe_profile 0 ( 0.0%) 0 ( 0.0%) 1 ( 0.1%) 0 ( 0.0%) 1 ( 0.1%)
double-profiler 2 ( 0.3%) 5 ( 0.7%) 5 ( 0.7%) 11 ( 1.5%) 23 ( 3.1%)
heatmap 3 ( 0.4%) 0 ( 0.0%) 0 ( 0.0%) 16 ( 2.2%) 19 ( 2.6%)
hist 0 ( 0.0%) 0 ( 0.0%) 3 ( 0.4%) 3 ( 0.4%) 6 ( 0.8%)
info 1 ( 0.1%) 11 ( 1.5%) 0 ( 0.0%) 1 ( 0.1%) 13 ( 1.8%)
lineplot 17 ( 2.3%) 0 ( 0.0%) 1 ( 0.1%) 18 ( 2.4%) 36 ( 4.9%)
Multi-view Chart 10 ( 1.3%) 7 ( 0.9%) 14 ( 1.9%) 28 ( 3.8%) 59 ( 8.0%)
none 25 ( 3.4%) 7 ( 0.9%) 2 ( 0.3%) 9 ( 1.2%) 43 ( 5.8%)
pairplot 9 ( 1.2%) 1 ( 0.1%) 9 ( 1.2%) 31 ( 4.2%) 50 ( 6.7%)
profile 22 ( 3.0%) 16 ( 2.2%) 53 ( 7.1%) 15 ( 2.0%) 106 ( 14.3%)
python 8 ( 1.1%) 20 ( 2.7%) 12 ( 1.6%) 20 ( 2.7%) 60 ( 8.1%)
scatterplot 32 ( 4.3%) 5 ( 0.7%) 3 ( 0.4%) 88 (11.9%) 128 ( 17.3%)
stripplot 0 ( 0.0%) 0 ( 0.0%) 0 ( 0.0%) 6 ( 0.8%) 6 ( 0.8%)
Total 160 (21.6%) 176 (23.7%) 122 (16.4%) 284 (38.3%) 742 (100.0%)

3.3 TODO PICK UP HERE

4 MODELLING

#DEFINE DATAFRAME
# Keep only the participant id, utterance id and the two design factors.
df <- select(df_coded, pid, uid, TASK, DATASET)
  
# #MOSAIC PLOT
# mosaic(formula = ~DATASET + TASK, 
#        data = df,
#        main = "Proportion of Utterances by TASK and DATASET", 
#        sub = "u = 734 utterance-codes",
#        labeling = labeling_values,
#        labeling_args = list(set_varnames = c(graph = "TASK",
#                             datset = "DATASET")))

4.1 UTTERANCES

How much variance in the number of utterances is explained by DATASET, TASK and PARTICIPANT?

4.1.1 OLS Mixed Effects Models

#DEFINE DATAFRAME
# One row per participant x DATASET x TASK with the number of df_coded rows in
# that cell.
# NOTE(review): df_coded holds one row per utterance + detail-code, so
# n_utterances counts coded rows rather than unique utterances (uid) -- confirm
# this is the intended outcome variable.
df <- df_coded %>%
  group_by(pid, DATASET, TASK) %>%
  # .groups = "drop" returns an ungrouped tibble. Keeping the grouping would
  # leave every row as its own group, so any later per-group operation on `df`
  # would silently act row by row.
  dplyr::summarise(n_utterances = n(), .groups = "drop")

#NUMBER UTTERANCES predicted by DATASET + TASK | participant --> MIXED LINEAR REGRESSION
print("LMER, UTTERANCES ~ DATASET + TASK")
## [1] "LMER, UTTERANCES ~ DATASET + TASK"
# Additive fixed effects of DATASET and TASK with a random intercept for each
# participant (pid).
mm1 <- lmer(n_utterances ~ DATASET + TASK + (1 | pid), data = df)
# print() rather than paste(): matches the label call above and still shows the
# label when the script is run through source().
print("Model")
## [1] "Model"
summ(mm1)
Observations 26
Dependent variable n_utterances
Type Mixed effects linear regression
AIC 198.45
BIC 204.74
Pseudo-R² (fixed effects) 0.12
Pseudo-R² (total) 0.68
Fixed Effects
Est. S.E. t val. d.f. p
(Intercept) 35.11 4.17 8.42 20.26 0.00
DATASETspace -8.90 3.33 -2.67 11.00 0.02
TASKixn -4.24 3.33 -1.27 11.00 0.23
p values calculated using Satterthwaite d.f.
Random Effects
Group Parameter Std. Dev.
pid (Intercept) 11.12
Residual 8.47
Grouping Variables
Group # groups ICC
pid 13 0.63
# Labels use print() rather than paste() so they match the other section labels
# and are still shown when the script is run through source().
print("Partition Variance")
## [1] "Partition Variance"
# F-tests for each fixed-effect term.
anova(mm1)
## Type III Analysis of Variance Table with Satterthwaite's method
##         Sum Sq Mean Sq NumDF DenDF F value Pr(>F)  
## DATASET 512.37  512.37     1    11  7.1424 0.0217 *
## TASK    116.06  116.06     1    11  1.6179 0.2296  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
print("Confidence Interval on Parameter Estimates")
## [1] "Confidence Interval on Parameter Estimates"
# Profile confidence intervals for the variance parameters and fixed effects.
confint(mm1)
## Computing profile confidence intervals ...
##                   2.5 %    97.5 %
## .sig01         5.839640 18.047250
## .sigma         5.540684 12.104462
## (Intercept)   26.986050 43.233730
## DATASETspace -15.384322 -2.425202
## TASKixn      -10.717656  2.241465
# report() states that it uses a Wald t-distribution approximation, so its CIs
# and p-values differ slightly from the profile CIs printed by confint().
report(mm1) #sanity check
## We fitted a linear mixed model (estimated using REML and nloptwrap optimizer)
## to predict n_utterances with DATASET and TASK (formula: n_utterances ~ DATASET
## + TASK). The model included pid as random effect (formula: ~1 | pid). The
## model's total explanatory power is substantial (conditional R2 = 0.68) and the
## part related to the fixed effects alone (marginal R2) is of 0.12. The model's
## intercept, corresponding to DATASET = happiness and TASK = static, is at 35.11
## (95% CI [26.44, 43.78], t(21) = 8.42, p < .001). Within this model:
## 
##   - The effect of DATASET [space] is statistically significant and negative (beta
## = -8.90, 95% CI [-15.83, -1.98], t(21) = -2.67, p = 0.014; Std. beta = -0.61,
## 95% CI [-1.09, -0.14])
##   - The effect of TASK [ixn] is statistically non-significant and negative (beta
## = -4.24, 95% CI [-11.17, 2.69], t(21) = -1.27, p = 0.217; Std. beta = -0.29,
## 95% CI [-0.77, 0.19])
## 
## Standardized parameters were obtained by fitting the model on a standardized
## version of the dataset. 95% Confidence Intervals (CIs) and p-values were
## computed using a Wald t-distribution approximation.
# Plot the estimates of mm1; show.intercept = TRUE asks for the intercept to be
# included (presumably sjPlot::plot_model -- confirm which package is attached).
plot_model(mm1,  show.intercept = TRUE)

# Diagnostic checks for mm1 (presumably performance::check_model -- confirm).
check_model(mm1)

#NUMBER UTTERANCES predicted by DATASET * TASK | participant --> MIXED LINEAR REGRESSION
print("LMER, UTTERANCES ~ DATASET X TASK")
## [1] "LMER, UTTERANCES ~ DATASET X TASK"
# DATASET * TASK fits both main effects plus their interaction, with a random
# intercept for each participant (pid).
# NOTE(review): each participant contributes one observation per TASK, and the
# file derives data_order from the TASK/DATASET pairing, so the DATASET:TASK
# term looks like it coincides with the between-participant order groups --
# confirm how the interaction should be interpreted.
mm2 <- lmer(n_utterances ~ DATASET * TASK + (1 | pid), data = df)
# print() rather than paste(): matches the label call above and still shows the
# label when the script is run through source().
print("Model")
## [1] "Model"
summ(mm2)
Observations 26
Dependent variable n_utterances
Type Mixed effects linear regression
AIC 192.74
BIC 200.29
Pseudo-R² (fixed effects) 0.14
Pseudo-R² (total) 0.70
Fixed Effects
Est. S.E. t val. d.f. p
(Intercept) 37.57 5.37 7.00 15.55 0.00
DATASETspace -14.24 7.90 -1.80 15.55 0.09
TASKixn -9.57 7.90 -1.21 15.55 0.24
DATASETspace:TASKixn 10.67 14.32 0.74 11.00 0.47
p values calculated using Satterthwaite d.f.
Random Effects
Group Parameter Std. Dev.
pid (Intercept) 11.39
Residual 8.47
Grouping Variables
Group # groups ICC
pid 13 0.64
# Labels use print() rather than paste() so they match the other section labels
# and are still shown when the script is run through source().
print("Partition Variance")
## [1] "Partition Variance"
# F-tests for each fixed-effect term, including the interaction.
anova(mm2)
## Type III Analysis of Variance Table with Satterthwaite's method
##              Sum Sq Mean Sq NumDF DenDF F value Pr(>F)  
## DATASET      512.37  512.37     1    11  7.1424 0.0217 *
## TASK         116.06  116.06     1    11  1.6179 0.2296  
## DATASET:TASK  39.80   39.80     1    11  0.5549 0.4720  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
print("Confidence Interval on Parameter Estimates")
## [1] "Confidence Interval on Parameter Estimates"
# Profile confidence intervals for the variance parameters and fixed effects.
confint(mm2)
## Computing profile confidence intervals ...
##                           2.5 %    97.5 %
## .sig01                 5.468708 17.569089
## .sigma                 5.540710 12.104731
## (Intercept)           27.350579 47.792279
## DATASETspace         -29.282779  0.806589
## TASKixn              -24.616112  5.473256
## DATASETspace:TASKixn -17.180459 38.513793
# report() states that it uses a Wald t-distribution approximation, so its CIs
# and p-values differ slightly from the profile CIs printed by confint().
report(mm2) #sanity check
## We fitted a linear mixed model (estimated using REML and nloptwrap optimizer)
## to predict n_utterances with DATASET and TASK (formula: n_utterances ~ DATASET
## * TASK). The model included pid as random effect (formula: ~1 | pid). The
## model's total explanatory power is substantial (conditional R2 = 0.70) and the
## part related to the fixed effects alone (marginal R2) is of 0.14. The model's
## intercept, corresponding to DATASET = happiness and TASK = static, is at 37.57
## (95% CI [26.38, 48.76], t(20) = 7.00, p < .001). Within this model:
## 
##   - The effect of DATASET [space] is statistically non-significant and negative
## (beta = -14.24, 95% CI [-30.71, 2.24], t(20) = -1.80, p = 0.086; Std. beta =
## -0.98, 95% CI [-2.11, 0.15])
##   - The effect of TASK [ixn] is statistically non-significant and negative (beta
## = -9.57, 95% CI [-26.04, 6.90], t(20) = -1.21, p = 0.240; Std. beta = -0.66,
## 95% CI [-1.79, 0.47])
##   - The effect of DATASET [space] × TASK [ixn] is statistically non-significant
## and positive (beta = 10.67, 95% CI [-19.20, 40.54], t(20) = 0.74, p = 0.465;
## Std. beta = 0.73, 95% CI [-1.32, 2.79])
## 
## Standardized parameters were obtained by fitting the model on a standardized
## version of the dataset. 95% Confidence Intervals (CIs) and p-values were
## computed using a Wald t-distribution approximation.
# Plot the estimates of mm2; show.intercept = TRUE asks for the intercept to be
# included (presumably sjPlot::plot_model -- confirm which package is attached).
plot_model(mm2,  show.intercept = TRUE)

# Diagnostic checks for mm2 (presumably performance::check_model -- confirm).
check_model(mm2)

4.1.2 POISSON Mixed Effects Models

# 
# #NUMBER UTTERANCES predicted by TASK + DATASET  | participant --> POISSON MIXED LINEAR REGRESSION
# print("POISSON-MER, UTTERANCES ~ DATASET + TASK")
# pmm1 <- glmer(n_utterances ~ TASK + DATASET + (1|pid), data = df, family = "poisson")
# paste("Model")
# summ(pmm1)
# paste("Partition Variance")
# anova(pmm1)
# paste("Confidence Interval on Parameter Estimates")
# confint(pmm1)
# report(pmm1) #sanity check
# plot_model(pmm1,  show.intercept = TRUE)
# check_model(pmm1)
# 
# #NUMBER UTTERANCES predicted by TASK X DATASET  | participant --> POISSON MIXED LINEAR REGRESSION
# print("POISSON-MER, UTTERANCES ~ DATASET X TASK")
# pmm2 <- glmer(n_utterances ~ TASK * DATASET + (1|pid), data = df, family = "poisson")
# paste("Model")
# summ(pmm2)
# paste("Partition Variance")
# anova(pmm2)
# paste("Confidence Interval on Parameter Estimates")
# confint(pmm2)
# report(pmm2) #sanity check
# plot_model(pmm2,  show.intercept = TRUE)
# check_model(pmm2)